library(readr)
library(ggplot2)
library(forcats)
library(corrplot)
## corrplot 0.92 loaded
library(gridExtra)
library(RColorBrewer)
# Load the training split. The CSV's first column has no header, so readr
# auto-names it `...1` (see the "New names" message below).
data <- read_csv(file = "train.csv")
## New names:
## • `` -> `...1`
## Rows: 103904 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Gender, Customer Type, Type of Travel, Class, satisfaction
## dbl (20): ...1, id, Age, Flight Distance, Inflight wifi service, Departure/A...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Peek at the first six rows to confirm the columns were parsed as expected.
head(data, n = 6)
## # A tibble: 6 × 25
## ...1 id Gender `Customer Type` Age `Type of Travel` Class
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 0 70172 Male Loyal Customer 13 Personal Travel Eco Plus
## 2 1 5047 Male disloyal Customer 25 Business travel Business
## 3 2 110028 Female Loyal Customer 26 Business travel Business
## 4 3 24026 Female Loyal Customer 25 Business travel Business
## 5 4 119299 Male Loyal Customer 61 Business travel Business
## 6 5 111157 Female Loyal Customer 26 Personal Travel Eco
## # ℹ 18 more variables: `Flight Distance` <dbl>, `Inflight wifi service` <dbl>,
## # `Departure/Arrival time convenient` <dbl>, `Ease of Online booking` <dbl>,
## # `Gate location` <dbl>, `Food and drink` <dbl>, `Online boarding` <dbl>,
## # `Seat comfort` <dbl>, `Inflight entertainment` <dbl>,
## # `On-board service` <dbl>, `Leg room service` <dbl>,
## # `Baggage handling` <dbl>, `Checkin service` <dbl>,
## # `Inflight service` <dbl>, Cleanliness <dbl>, …
# Per-column summary statistics (also reveals which columns contain NA values).
summary(object = data)
## ...1 id Gender Customer Type
## Min. : 0 Min. : 1 Length:103904 Length:103904
## 1st Qu.: 25976 1st Qu.: 32534 Class :character Class :character
## Median : 51952 Median : 64856 Mode :character Mode :character
## Mean : 51952 Mean : 64924
## 3rd Qu.: 77927 3rd Qu.: 97368
## Max. :103903 Max. :129880
##
## Age Type of Travel Class Flight Distance
## Min. : 7.00 Length:103904 Length:103904 Min. : 31
## 1st Qu.:27.00 Class :character Class :character 1st Qu.: 414
## Median :40.00 Mode :character Mode :character Median : 843
## Mean :39.38 Mean :1189
## 3rd Qu.:51.00 3rd Qu.:1743
## Max. :85.00 Max. :4983
##
## Inflight wifi service Departure/Arrival time convenient Ease of Online booking
## Min. :0.00 Min. :0.00 Min. :0.000
## 1st Qu.:2.00 1st Qu.:2.00 1st Qu.:2.000
## Median :3.00 Median :3.00 Median :3.000
## Mean :2.73 Mean :3.06 Mean :2.757
## 3rd Qu.:4.00 3rd Qu.:4.00 3rd Qu.:4.000
## Max. :5.00 Max. :5.00 Max. :5.000
##
## Gate location Food and drink Online boarding Seat comfort
## Min. :0.000 Min. :0.000 Min. :0.00 Min. :0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:2.000
## Median :3.000 Median :3.000 Median :3.00 Median :4.000
## Mean :2.977 Mean :3.202 Mean :3.25 Mean :3.439
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.00 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000
##
## Inflight entertainment On-board service Leg room service Baggage handling
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :3.358 Mean :3.382 Mean :3.351 Mean :3.632
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
##
## Checkin service Inflight service Cleanliness Departure Delay in Minutes
## Min. :0.000 Min. :0.00 Min. :0.000 Min. : 0.00
## 1st Qu.:3.000 1st Qu.:3.00 1st Qu.:2.000 1st Qu.: 0.00
## Median :3.000 Median :4.00 Median :3.000 Median : 0.00
## Mean :3.304 Mean :3.64 Mean :3.286 Mean : 14.82
## 3rd Qu.:4.000 3rd Qu.:5.00 3rd Qu.:4.000 3rd Qu.: 12.00
## Max. :5.000 Max. :5.00 Max. :5.000 Max. :1592.00
##
## Arrival Delay in Minutes satisfaction
## Min. : 0.00 Length:103904
## 1st Qu.: 0.00 Class :character
## Median : 0.00 Mode :character
## Mean : 15.18
## 3rd Qu.: 13.00
## Max. :1584.00
## NA's :310
# Count the missing values in the arrival-delay column.
sum(is.na(data[["Arrival Delay in Minutes"]]))
## [1] 310
# Replace missing arrival delays with the median of the observed values.
arrival_delay <- data[["Arrival Delay in Minutes"]]
arrival_delay[is.na(arrival_delay)] <- median(arrival_delay, na.rm = TRUE)
data[["Arrival Delay in Minutes"]] <- arrival_delay
# Duplicate check.
# `...1` is the unnamed first CSV column (a running row index according to the
# summary above) and `id` is presumably a unique record identifier (TODO
# confirm). Leaving them in makes every row distinct by construction, so they
# are excluded before looking for repeated observations.
identifier_cols <- c("...1", "id")
content_cols <- setdiff(names(data), identifier_cols)
sum(duplicated(data[content_cols]))
## [1] 0
# Bar chart of the level counts for every categorical column.
# The names keep their back-ticked form; the back-ticks are stripped before
# the name is used for column lookup or for labels.
categorical_vars_ggplot <- c(
  "Gender", "`Customer Type`", "`Type of Travel`", "Class", "satisfaction"
)

# Each plot is built inside a function so that `col_name` is fixed in the
# function's own environment. `.data[[col_name]]` is evaluated lazily when the
# plot is rendered; in a plain top-level `for` loop every stored plot would end
# up using the last value of the loop variable.
plot_list <- lapply(categorical_vars_ggplot, function(cat_var) {
  col_name <- gsub("`", "", cat_var, fixed = TRUE)
  ggplot(data, aes(x = .data[[col_name]], fill = .data[[col_name]])) +
    geom_bar() +
    geom_text(
      stat = "count",
      aes(label = after_stat(count), y = after_stat(count)),
      vjust = -0.5
    ) +
    labs(title = paste("Distribution of", col_name), x = col_name, y = "Count") +
    scale_fill_brewer(palette = "Set3") +
    theme_minimal() +
    theme(legend.position = "none")
})
# Keep the list keyed by the (back-ticked) variable names, as before.
names(plot_list) <- categorical_vars_ggplot
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Lay the categorical bar charts out on a two-column grid.
gridExtra::grid.arrange(grobs = plot_list, ncol = 2)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Numeric columns, written with back-ticks where the name is not syntactic.
numerical_vars_ggplot <- c(
  "Age", "`Flight Distance`", "`Inflight wifi service`",
  "`Departure/Arrival time convenient`", "`Ease of Online booking`",
  "`Gate location`", "`Food and drink`", "`Online boarding`",
  "`Seat comfort`", "`Inflight entertainment`", "`On-board service`",
  "`Leg room service`", "`Baggage handling`", "`Checkin service`",
  "`Inflight service`", "Cleanliness", "`Departure Delay in Minutes`",
  "`Arrival Delay in Minutes`"
)

# Number of distinct values per column (back-ticks stripped for the lookup).
distinct_counts <- vapply(
  numerical_vars_ggplot,
  function(num_var) {
    length(unique(data[[gsub("`", "", num_var, fixed = TRUE)]]))
  },
  integer(1)
)

# Columns with fewer than 10 distinct values are treated as discrete (bar
# plot); everything else is treated as continuous (histogram).
is_discrete <- distinct_counts < 10
barplot_vars <- numerical_vars_ggplot[is_discrete]
histogram_vars <- numerical_vars_ggplot[!is_discrete]
barplot_vars
## [1] "`Inflight wifi service`" "`Departure/Arrival time convenient`"
## [3] "`Ease of Online booking`" "`Gate location`"
## [5] "`Food and drink`" "`Online boarding`"
## [7] "`Seat comfort`" "`Inflight entertainment`"
## [9] "`On-board service`" "`Leg room service`"
## [11] "`Baggage handling`" "`Checkin service`"
## [13] "`Inflight service`" "Cleanliness"
# Columns that will be drawn as histograms.
print(histogram_vars)
## [1] "Age" "`Flight Distance`"
## [3] "`Departure Delay in Minutes`" "`Arrival Delay in Minutes`"
# Draw one distribution plot per numeric column: a labelled bar chart when the
# column has fewer than 10 distinct values, a 40-bin histogram otherwise.
# Each plot is printed immediately, so `.data[[col_name]]` is resolved while
# `col_name` still holds the current column.
for (num_var in numerical_vars_ggplot) {
  col_name <- gsub("`", "", num_var, fixed = TRUE)
  unique_vals <- length(unique(data[[col_name]]))

  base_plot <- ggplot(data, aes(x = .data[[col_name]])) +
    labs(
      title = paste("Distribution of", col_name),
      x = col_name,
      y = "Frequency"
    )

  if (unique_vals < 10) {
    # No fill aesthetic is mapped here, so no fill scale is needed.
    plot_obj <- base_plot +
      geom_bar() +
      geom_text(
        stat = "count",
        aes(label = after_stat(count), y = after_stat(count)),
        vjust = -0.5
      ) +
      theme_minimal() +
      theme(legend.position = "none")
  } else {
    plot_obj <- base_plot +
      geom_histogram(bins = 40, fill = "skyblue", color = "black", alpha = 0.7)
  }
  print(plot_obj)
}
# Histograms of the continuous columns, arranged on a two-column grid.
continuous_vars <- c(
  "Age", "`Flight Distance`", "`Departure Delay in Minutes`",
  "`Arrival Delay in Minutes`"
)

# Built with lapply() rather than a top-level loop: the plots are rendered
# later by grid.arrange(), and `.data[[col_name]]` is evaluated lazily, so
# each plot needs its own environment holding its own `col_name`.
continuous_plots <- lapply(continuous_vars, function(num_var) {
  col_name <- gsub("`", "", num_var, fixed = TRUE)
  ggplot(data, aes(x = .data[[col_name]])) +
    geom_histogram(bins = 40, fill = "skyblue", color = "black", alpha = 0.7) +
    labs(
      title = paste("Distribution of", col_name),
      x = col_name,
      y = "Frequency"
    ) +
    theme_minimal()
})

if (length(continuous_plots) > 0) {
  grid.arrange(grobs = continuous_plots, ncol = 2)
}
# Bar charts of the rating columns, one colour per rating value.
rating_vars <- c(
  "`Inflight wifi service`", "`Departure/Arrival time convenient`",
  "`Ease of Online booking`", "`Gate location`", "`Food and drink`",
  "`Online boarding`", "`Seat comfort`", "`Inflight entertainment`",
  "`On-board service`", "`Leg room service`", "`Baggage handling`",
  "`Checkin service`", "`Inflight service`", "Cleanliness"
)

# Built with lapply() so each plot captures its own `col_name`; the plots are
# only rendered later by grid.arrange() and `.data[[col_name]]` is lazy.
rating_plots <- lapply(rating_vars, function(rate_var) {
  col_name <- gsub("`", "", rate_var, fixed = TRUE)
  ggplot(data, aes(x = .data[[col_name]])) +
    # The ratings are numeric; mapping them to `fill` directly makes
    # stat_count drop the aesthetic ("aesthetics were dropped ... fill").
    # Converting to a factor gives one discrete group (and colour) per rating.
    geom_bar(aes(fill = factor(.data[[col_name]]))) +
    geom_text(
      stat = "count",
      aes(label = after_stat(count), y = after_stat(count)),
      vjust = 0,
      size = 20
    ) +
    labs(
      title = paste("Distribution of", col_name),
      x = col_name,
      y = "Frequency"
    ) +
    scale_fill_brewer(palette = "Set3") +
    theme_minimal() +
    theme(
      legend.position = "none",
      plot.title = element_text(size = 54, face = "bold"),
      axis.title.x = element_text(size = 52, face = "bold"),
      axis.title.y = element_text(size = 52, face = "bold"),
      axis.text.x = element_text(size = 50),
      axis.text.y = element_text(size = 50)
    )
})

if (length(rating_plots) > 0) {
  grid.arrange(grobs = rating_plots, ncol = 2)
}
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
# Correlation heat map of the numeric columns.
# Column names are stored back-ticked for plotting; strip the back-ticks so
# they can be used to index the data directly.
numerical_vars_correlation <- gsub(
  pattern = "`",
  replacement = "",
  x = numerical_vars_ggplot
)
numeric_columns <- data[numerical_vars_correlation]
correlation_matrix <- cor(numeric_columns)

# Plot-size option (read by notebook front ends that honour `repr.*`).
options(repr.plot.width = 100, repr.plot.height = 80)

# Upper triangle only, variables ordered by hierarchical clustering, with the
# coefficient printed in every cell.
corrplot(
  correlation_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 90,
  addCoef.col = "black",
  number.cex = 0.9,
  cl.cex = 0.9
)
# Count the values of `x` lying outside the Tukey fences
# [Q1 - k * IQR, Q3 + k * IQR]. Missing values are ignored.
count_iqr_outliers <- function(x, k = 1.5) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  spread <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - k * spread
  upper_bound <- quartiles[2] + k * spread
  sum(x < lower_bound | x > upper_bound, na.rm = TRUE)
}

# Outlier counts for the continuous columns.
selected_data <- data[c(
  "Age", "Flight Distance", "Departure Delay in Minutes",
  "Arrival Delay in Minutes"
)]
# vapply() guarantees a named integer vector, one count per column.
outliers_counts_selected <- vapply(
  selected_data,
  count_iqr_outliers,
  integer(1)
)
outliers_counts_selected
## Age Flight Distance
## 0 2291
## Departure Delay in Minutes Arrival Delay in Minutes
## 14529 13954
Age: No outliers, so no action needed.
Flight Distance: Given that it’s plausible for some flights to have longer distances, capping might be a better approach than outright removal.
Departure/Arrival Delay in Minutes: Delays can vary significantly, with occasional very long delays. Instead of removing these values, it might be more beneficial to apply a log transformation to reduce the skewness and impact of extreme values.
Note: We’ll add 1 before applying the log transformation (i.e. use log(1 + x)) to handle instances with a delay of 0 minutes.
# Cap Flight Distance at the Tukey fences (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR).
flight_distance <- data[["Flight Distance"]]
Q1_fd <- quantile(flight_distance, 0.25)
Q3_fd <- quantile(flight_distance, 0.75)
IQR_fd <- Q3_fd - Q1_fd
lower_cap_fd <- Q1_fd - 1.5 * IQR_fd
upper_cap_fd <- Q3_fd + 1.5 * IQR_fd
floored_distance <- pmax(flight_distance, lower_cap_fd)
data[["Flight Distance"]] <- pmin(floored_distance, upper_cap_fd)

# Replace both delay columns by log(1 + x) to reduce right skew. The columns
# are overwritten in place, so re-running this step transforms them again.
for (delay_col in c("Departure Delay in Minutes", "Arrival Delay in Minutes")) {
  data[[delay_col]] <- log1p(data[[delay_col]])
}

# Show the first rows of the three treated columns.
treated_cols <- c(
  "Flight Distance", "Departure Delay in Minutes", "Arrival Delay in Minutes"
)
head(data[treated_cols])
## # A tibble: 6 × 3
## `Flight Distance` `Departure Delay in Minutes` `Arrival Delay in Minutes`
## <dbl> <dbl> <dbl>
## 1 460 3.26 2.94
## 2 235 0.693 1.95
## 3 1142 0 0
## 4 562 2.48 2.30
## 5 214 0 0
## 6 1180 0 0
# Dodged bar charts of every categorical feature, split by satisfaction.
# NOTE: the former `par(mfrow = c(2, 2))` call was removed. `par()` only
# affects base graphics, so it never changed how these ggplot2 figures are
# laid out, and it left the base graphics state modified for later code.
categorical_features <- c(
  "Gender", "`Customer Type`", "`Type of Travel`", "Class"
)
for (feature in categorical_features) {
  col_name <- gsub("`", "", feature, fixed = TRUE)
  # Printed immediately, so `.data[[col_name]]` sees the current column.
  p <- ggplot(data, aes(x = .data[[col_name]], fill = satisfaction)) +
    geom_bar(position = "dodge") +
    geom_text(
      stat = "count",
      aes(label = after_stat(count)),
      vjust = -0.5,
      position = position_dodge(width = 0.9)
    ) +
    labs(
      title = paste("Distribution of", col_name, "by Satisfaction"),
      x = col_name,
      y = "Count"
    ) +
    scale_fill_brewer(palette = "Set3") +
    theme_minimal() +
    theme(legend.position = "top")
  print(p)
}
Gender: Both genders have a fairly similar distribution of satisfaction levels. Customer Type: Loyal customers tend to be more satisfied than disloyal ones. Type of Travel: Passengers traveling for business purposes are generally more satisfied than those traveling for personal reasons. Class: Business class passengers are noticeably more satisfied than those in Eco or Eco Plus.
# Overlaid density curves of every continuous feature, split by satisfaction.
# NOTE: the former `par(mfrow = c(2, 2))` call was removed. `par()` only
# affects base graphics and has no effect on ggplot2 output.
numerical_features <- c(
  "Age", "`Flight Distance`", "`Departure Delay in Minutes`",
  "`Arrival Delay in Minutes`"
)
for (feature in numerical_features) {
  # Strip the back-ticks so titles and axis labels show the plain column name.
  col_name <- gsub("`", "", feature, fixed = TRUE)
  # Printed immediately, so `.data[[col_name]]` sees the current column.
  p <- ggplot(data, aes(x = .data[[col_name]], fill = satisfaction)) +
    geom_density(alpha = 0.5, position = "identity") +
    labs(
      title = paste("Distribution of", col_name, "by Satisfaction"),
      x = col_name,
      y = "Density"
    ) +
    scale_fill_manual(
      values = c("satisfied" = "green", "neutral or dissatisfied" = "red")
    ) +
    theme_minimal() +
    theme(legend.position = "top")
  print(p)
}
Age: Younger passengers tend to be more neutral or dissatisfied, while older passengers lean more towards satisfaction.
Flight Distance: Passengers traveling shorter distances seem to be more neutral or dissatisfied compared to those traveling longer distances.
Departure Delay in Minutes: Although the distributions overlap considerably, there’s a slightly higher density of neutral or dissatisfied passengers with longer departure delays.
Arrival Delay in Minutes: Similar to departure delays, passengers with longer arrival delays tend to be more neutral or dissatisfied.
# Plot a random subset of at most 5000 rows to keep the scatter plot readable.
# The seed is fixed so the figure (and the commentary that interprets it) is
# reproducible; the size is capped at nrow(data) because sample() errors when
# asked for more rows than exist.
set.seed(42)
sample_size <- min(5000, nrow(data))
data_sample <- data[sample(nrow(data), sample_size), ]

# Scatter plot of Age against Flight Distance, coloured by satisfaction.
ggplot(data_sample, aes(x = Age, y = `Flight Distance`, color = satisfaction)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(
    values = c("neutral or dissatisfied" = "red", "satisfied" = "green")
  ) +
  labs(title = "Relationship between Age and Flight Distance by Satisfaction") +
  theme_minimal()
There’s a spread of satisfied and neutral/dissatisfied customers across various ages and flight distances.
Older passengers who travel longer distances seem to be predominantly satisfied.
Younger passengers, especially those traveling shorter distances, display a mix of satisfaction levels.
Null hypothesis: There is no association between the categorical variable and the satisfaction of passengers.
Alternative hypothesis: There is a statistically significant association between the categorical variable and the satisfaction of passengers.
# Categorical predictors to test against satisfaction (plain column names).
categorical_vars <- c(
  "Gender",
  "Customer Type",
  "Type of Travel",
  "Class"
)
# Chi-squared test of independence between one column and a target column.
#
# Arguments:
#   feature  Name (single string) of the column to test.
#   df       Data frame holding both columns. Defaults to the `data` object
#            visible from where this function is defined, which keeps
#            existing calls of the form perform_chi2_test(feature) working.
#   target   Name (single string) of the outcome column; defaults to
#            "satisfaction".
#
# Returns the p-value of chisq.test() applied to the feature-by-target
# contingency table. Stops with an error if either column is missing from `df`.
perform_chi2_test <- function(feature, df = data, target = "satisfaction") {
  stopifnot(
    is.character(feature), length(feature) == 1L,
    is.character(target), length(target) == 1L,
    feature %in% names(df),
    target %in% names(df)
  )
  contingency_table <- table(df[[feature]], df[[target]])
  chisq.test(contingency_table)$p.value
}
# One p-value per categorical predictor. vapply() enforces a single numeric
# result per variable and names the output after `categorical_vars`, so no
# separate names<- step is needed.
chi2_p_values <- vapply(categorical_vars, perform_chi2_test, numeric(1))
chi2_p_values
## Gender Customer Type Type of Travel Class
## 8.496755e-05 0.000000e+00 0.000000e+00 0.000000e+00
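For all these variables, the p-values are extremely small (the values printed as 0 are smaller than floating-point precision can represent, not exactly zero), indicating that there is a statistically significant association between the categorical variable and the target variable satisfaction.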
Given the extremely small p-values for all the categorical variables, we can reject the null hypothesis for each of them. This implies that there’s a statistically significant association between each of these categorical variables (Gender, Customer Type, Type of Travel, and Class) and passenger satisfaction.
In simpler terms, the likelihood of a passenger being satisfied (or not) is not independent of their gender, customer type, type of travel, or class. Each of these factors is associated with the satisfaction level; note that the test establishes association only, not causation, and says nothing about the strength of the association.
Null hypothesis: There is no difference in the means of “Arrival Delay in Minutes” between the two satisfaction groups (satisfied and neutral or dissatisfied). Note that the column was log-transformed (log(1 + x)) above, so the means being compared are on the log scale.
Alternative hypothesis: There is a significant difference in the means of (log-transformed) “Arrival Delay in Minutes” between the two satisfaction groups.
# Welch two-sample t-test on arrival delay, satisfied vs. neutral/dissatisfied.
# The column holds log1p-transformed delays at this point in the script.
arrival_delay_values <- data[["Arrival Delay in Minutes"]]
satisfaction_label <- data[["satisfaction"]]
group1_arrival_delay <- arrival_delay_values[satisfaction_label == "satisfied"]
group2_arrival_delay <- arrival_delay_values[
  satisfaction_label == "neutral or dissatisfied"
]
t_test_result_arrival_delay <- t.test(
  x = group1_arrival_delay,
  y = group2_arrival_delay
)
print(t_test_result_arrival_delay)
##
## Welch Two Sample t-test
##
## data: group1_arrival_delay and group2_arrival_delay
## t = -33.162, df = 100286, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3528694 -0.3134851
## sample estimates:
## mean of x mean of y
## 1.072264 1.405441